In [1]:
#Downloading the Data
In [2]:
!pip install pandas
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: pandas in c:\programdata\anaconda3\lib\site-packages (1.5.3)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\programdata\anaconda3\lib\site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\programdata\anaconda3\lib\site-packages (from pandas) (2022.7)
Requirement already satisfied: numpy>=1.21.0 in c:\programdata\anaconda3\lib\site-packages (from pandas) (1.24.3)
Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)
In [3]:
medical_charges_url = 'https://raw.githubusercontent.com/JovianML/opendatasets/master/data/medical-charges.csv'
In [4]:
from urllib.request import urlretrieve
In [5]:
urlretrieve(medical_charges_url, 'medical.csv')
Out[5]:
('medical.csv', <http.client.HTTPMessage at 0x1c6081f00d0>)
In [6]:
import pandas as pd
medical_df = pd.read_csv('medical.csv')
medical_df
Out[6]:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
... ... ... ... ... ... ... ...
1333 50 male 30.970 3 no northwest 10600.54830
1334 18 female 31.920 0 no northeast 2205.98080
1335 18 female 36.850 0 no southeast 1629.83350
1336 21 female 25.800 0 no southwest 2007.94500
1337 61 female 29.070 0 yes northwest 29141.36030

1338 rows × 7 columns

In [7]:
medical_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
In [8]:
medical_df.describe()
Out[8]:
age bmi children charges
count 1338.000000 1338.000000 1338.000000 1338.000000
mean 39.207025 30.663397 1.094918 13270.422265
std 14.049960 6.098187 1.205493 12110.011237
min 18.000000 15.960000 0.000000 1121.873900
25% 27.000000 26.296250 0.000000 4740.287150
50% 39.000000 30.400000 1.000000 9382.033000
75% 51.000000 34.693750 2.000000 16639.912515
max 64.000000 53.130000 5.000000 63770.428010
In [9]:
#Exploratory Analysis and Visualization
In [10]:
!pip install plotly 
!pip install matplotlib
!pip install seaborn 
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: plotly in c:\programdata\anaconda3\lib\site-packages (5.9.0)
Requirement already satisfied: tenacity>=6.2.0 in c:\programdata\anaconda3\lib\site-packages (from plotly) (8.2.2)
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: matplotlib in c:\programdata\anaconda3\lib\site-packages (3.7.1)
Requirement already satisfied: contourpy>=1.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (1.0.5)
Requirement already satisfied: cycler>=0.10 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (1.4.4)
Requirement already satisfied: numpy>=1.20 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (1.24.3)
Requirement already satisfied: packaging>=20.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (23.0)
Requirement already satisfied: pillow>=6.2.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (9.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (2.8.2)
Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: seaborn in c:\programdata\anaconda3\lib\site-packages (0.12.2)
Requirement already satisfied: numpy!=1.24.0,>=1.17 in c:\programdata\anaconda3\lib\site-packages (from seaborn) (1.24.3)
Requirement already satisfied: pandas>=0.25 in c:\programdata\anaconda3\lib\site-packages (from seaborn) (1.5.3)
Requirement already satisfied: matplotlib!=3.6.1,>=3.1 in c:\programdata\anaconda3\lib\site-packages (from seaborn) (3.7.1)
Requirement already satisfied: contourpy>=1.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.0.5)
Requirement already satisfied: cycler>=0.10 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.4.4)
Requirement already satisfied: packaging>=20.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (23.0)
Requirement already satisfied: pillow>=6.2.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (9.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\programdata\anaconda3\lib\site-packages (from pandas>=0.25->seaborn) (2022.7)
Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.1->seaborn) (1.16.0)
In [11]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [12]:
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'
In [13]:
#Age

#Age is a numeric column. The minimum age in the dataset is 18 and the maximum age is 64. Thus, we can visualize the distribution of age using a histogram with 47 bins (one for each year) and a box plot.
In [14]:
medical_df.age.describe()
Out[14]:
count    1338.000000
mean       39.207025
std        14.049960
min        18.000000
25%        27.000000
50%        39.000000
75%        51.000000
max        64.000000
Name: age, dtype: float64
In [15]:
fig = px.histogram(medical_df, 
                   x='age', 
                   marginal='box', 
                   nbins=47, 
                   title='Distribution of Age')
fig.update_layout(bargap=0.1)
fig.show()
In [16]:
#Body Mass Index

#Let's look at the distribution of BMI (Body Mass Index) of customers, using a histogram and box plot.
In [17]:
fig = px.histogram(medical_df, 
                   x='bmi', 
                   marginal='box', 
                   color_discrete_sequence=['red'], 
                   title='Distribution of BMI (Body Mass Index)')
fig.update_layout(bargap=0.1)
fig.show()
In [18]:
#Charges

#Let's visualize the distribution of "charges" i.e. the annual medical charges for customers. This is the column we're trying to predict. Let's also use the categorical column "smoker" to distinguish the charges for smokers and non-smokers.
In [19]:
fig = px.histogram(medical_df, 
                   x='charges', 
                   marginal='box', 
                   color='smoker', 
                   color_discrete_sequence=['green', 'grey'], 
                   title='Annual Medical Charges')
fig.update_layout(bargap=0.1)
fig.show()
In [20]:
#Smoker

#Let's visualize the distribution of the "smoker" column (containing values "yes" and "no") using a histogram.
In [21]:
medical_df.smoker.value_counts()
Out[21]:
no     1064
yes     274
Name: smoker, dtype: int64
In [22]:
px.histogram(medical_df, x='smoker', color='sex', title='Smoker')
In [23]:
#Age and Charges

#Let's visualize the relationship between "age" and "charges" using a scatter plot. Each point in the scatter plot represents one customer. We'll also use values in the "smoker" column to color the points.
In [24]:
fig = px.scatter(medical_df, 
                 x='age', 
                 y='charges', 
                 color='smoker', 
                 opacity=0.8, 
                 hover_data=['sex'], 
                 title='Age vs. Charges')
fig.update_traces(marker_size=5)
fig.show()
In [25]:
#We can make the following observations from the above chart:

#The general trend seems to be that medical charges increase with age, as we might expect. However, there is significant variation at every age, and it's clear that age alone cannot be used to accurately determine medical charges.


#We can see three "clusters" of points, each of which seems to form a line with an increasing slope:

    # 1.The first and the largest cluster consists primary of presumably "healthy non-smokers" who have relatively low medical charges compared to others
     
    # 2. The second cluster contains a mix of smokers and non-smokers. It's possible that these are actually two distinct but overlapping clusters: "non-smokers with medical issues" and "smokers without major medical issues".
     
    # 3. The final cluster consists exclusively of smokers, presumably smokers with major medical issues that are possibly related to or worsened by smoking.
In [26]:
fig = px.scatter(medical_df, 
                 x='bmi', 
                 y='charges', 
                 color='smoker', 
                 opacity=0.8, 
                 hover_data=['sex'], 
                 title='BMI vs. Charges')
fig.update_traces(marker_size=5)
fig.show()
In [27]:
#Correlation
In [28]:
medical_df.charges.corr(medical_df.age)
Out[28]:
0.2990081933306476
In [29]:
medical_df.charges.corr(medical_df.bmi)
Out[29]:
0.19834096883362884
In [30]:
smoker_values = {'no': 0, 'yes': 1}
smoker_numeric = medical_df.smoker.map(smoker_values)
medical_df.charges.corr(smoker_numeric)
Out[30]:
0.7872514304984767
In [31]:
medical_df.corr()
C:\Users\yashg\AppData\Local\Temp\ipykernel_21820\4290363667.py:1: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

Out[31]:
age bmi children charges
age 1.000000 0.109272 0.042469 0.299008
bmi 0.109272 1.000000 0.012759 0.198341
children 0.042469 0.012759 1.000000 0.067998
charges 0.299008 0.198341 0.067998 1.000000
In [32]:
sns.heatmap(medical_df.corr(), cmap='Reds', annot=True)
plt.title('Correlation Matrix');
C:\Users\yashg\AppData\Local\Temp\ipykernel_21820\2153061389.py:1: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

In [33]:
#Linear Regression using a Single Feature
In [34]:
non_smoker_df = medical_df[medical_df.smoker == 'no']
In [35]:
plt.title('Age vs. Charges')
sns.scatterplot(data=non_smoker_df, x='age', y='charges', alpha=0.7, s=15);
In [36]:
#Model
#linear regression
#charges=w×age+b
In [37]:
def estimate_charges(age, w, b):
    return w * age + b
In [38]:
w = 50
b = 100
ages = non_smoker_df.age
estimated_charges = estimate_charges(ages, w, b)
In [39]:
plt.plot(ages, estimated_charges, 'r-o');
plt.xlabel('Age');
plt.ylabel('Estimated Charges');
In [40]:
target = non_smoker_df.charges

plt.plot(ages, estimated_charges, 'r', alpha=0.9);
plt.scatter(ages, target, s=8,alpha=0.8);
plt.xlabel('Age');
plt.ylabel('Charges')
plt.legend(['Estimate', 'Actual']);
In [41]:
def try_parameters(w, b):
    ages = non_smoker_df.age
    target = non_smoker_df.charges
    
    estimated_charges = estimate_charges(ages, w, b)
    
    plt.plot(ages, estimated_charges, 'r', alpha=0.9);
    plt.scatter(ages, target, s=8,alpha=0.8);
    plt.xlabel('Age');
    plt.ylabel('Charges')
    plt.legend(['Estimate', 'Actual']);
In [42]:
try_parameters(60, 200)
In [43]:
try_parameters(400, 5000)
In [44]:
#Loss/Cost Function


#Let's define a function to compute the RMSE.
In [45]:
!pip install numpy
import numpy as np
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: numpy in c:\programdata\anaconda3\lib\site-packages (1.24.3)
In [46]:
def rmse(targets, predictions):
    return np.sqrt(np.mean(np.square(targets - predictions)))
In [47]:
w = 50
b = 100
try_parameters(w, b)
In [48]:
targets = non_smoker_df['charges']
predicted = estimate_charges(non_smoker_df.age, w, b)
In [49]:
rmse(targets, predicted)
Out[49]:
8461.949562575493
In [50]:
def try_parameters(w, b):
    ages = non_smoker_df.age
    target = non_smoker_df.charges
    predictions = estimate_charges(ages, w, b)
    
    plt.plot(ages, predictions, 'r', alpha=0.9);
    plt.scatter(ages, target, s=8,alpha=0.8);
    plt.xlabel('Age');
    plt.ylabel('Charges')
    plt.legend(['Prediction', 'Actual']);
    
    loss = rmse(target, predictions)
    print("RMSE Loss: ", loss)
In [51]:
try_parameters(50, 100)
RMSE Loss:  8461.949562575493
In [52]:
#Linear Regression using Scikit-learn
In [53]:
!pip install scikit-learn

from sklearn.linear_model import LinearRegression

model = LinearRegression()
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: scikit-learn in c:\users\yashg\appdata\roaming\python\python311\site-packages (1.4.1.post1)
Requirement already satisfied: numpy<2.0,>=1.19.5 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn) (1.24.3)
Requirement already satisfied: scipy>=1.6.0 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn) (1.10.1)
Requirement already satisfied: joblib>=1.2.0 in c:\users\yashg\appdata\roaming\python\python311\site-packages (from scikit-learn) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn) (2.2.0)
In [54]:
inputs = non_smoker_df[['age']]
targets = non_smoker_df.charges
print('inputs.shape :', inputs.shape)
print('targes.shape :', targets.shape)
inputs.shape : (1064, 1)
targes.shape : (1064,)
In [55]:
model.fit(inputs, targets)
Out[55]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [56]:
model.predict(np.array([[23], 
                        [37], 
                        [61]]))
C:\Users\yashg\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py:493: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

Out[56]:
array([ 4055.30443855,  7796.78921819, 14210.76312614])
In [57]:
predictions = model.predict(inputs)
In [58]:
predictions
Out[58]:
array([2719.0598744 , 5391.54900271, 6727.79356686, ..., 2719.0598744 ,
       2719.0598744 , 3520.80661289])
In [59]:
rmse(targets, predictions)
Out[59]:
4662.505766636395
In [60]:
# w
model.coef_
Out[60]:
array([267.24891283])
In [61]:
# b
model.intercept_
Out[61]:
-2091.4205565650827
In [62]:
try_parameters(model.coef_, model.intercept_)
RMSE Loss:  4662.505766636395
In [63]:
# Create inputs and targets
inputs, targets = non_smoker_df[['age']], non_smoker_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evalute the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 4662.505766636395
In [64]:
#Linear Regression using Multiple Features

#So far, we've used on the "age" feature to estimate "charges". Adding another feature like "bmi" is fairly straightforward. We simply assume the following relationship:

#charges = w_1 \times age + w_2 \times bmi + b
In [65]:
# Create inputs and targets
inputs, targets = non_smoker_df[['age', 'bmi']], non_smoker_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evalute the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 4662.3128354612945
In [66]:
non_smoker_df.charges.corr(non_smoker_df.bmi)
Out[66]:
0.08403654312833271
In [67]:
fig = px.scatter(non_smoker_df, x='bmi', y='charges', title='BMI vs. Charges')
fig.update_traces(marker_size=5)
fig.show()
In [68]:
fig = px.scatter_3d(non_smoker_df, x='age', y='bmi', z='charges')
fig.update_traces(marker_size=3, marker_opacity=0.5)
fig.show()
In [69]:
model.coef_, model.intercept_
Out[69]:
(array([266.87657817,   7.07547666]), -2293.6320906488654)
In [70]:
non_smoker_df.charges.corr(non_smoker_df.children)
Out[70]:
0.13892870453542197
In [71]:
fig = px.strip(non_smoker_df, x='children', y='charges', title= "Children vs. Charges")
fig.update_traces(marker_size=4, marker_opacity=0.7)
fig.show()
In [72]:
# Create inputs and targets
inputs, targets = non_smoker_df[['age', 'bmi', 'children']], non_smoker_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evalute the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 4608.470405038247
In [73]:
# Create inputs and targets
inputs, targets = medical_df[['age', 'bmi', 'children']], medical_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evalute the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 11355.317901125973
In [74]:
#Using Categorical Features for Machine Learning

#Binary Categories
#The "smoker" category has just two values "yes" and "no". Let's create a new column "smoker_code" containing 0 for "no" and 1 for "yes".
In [75]:
sns.barplot(data=medical_df, x='smoker', y='charges');
In [76]:
smoker_codes = {'no': 0, 'yes': 1}
medical_df['smoker_code'] = medical_df.smoker.map(smoker_codes)
medical_df.charges.corr(medical_df.smoker_code)
Out[76]:
0.7872514304984767
In [77]:
medical_df
Out[77]:
age sex bmi children smoker region charges smoker_code
0 19 female 27.900 0 yes southwest 16884.92400 1
1 18 male 33.770 1 no southeast 1725.55230 0
2 28 male 33.000 3 no southeast 4449.46200 0
3 33 male 22.705 0 no northwest 21984.47061 0
4 32 male 28.880 0 no northwest 3866.85520 0
... ... ... ... ... ... ... ... ...
1333 50 male 30.970 3 no northwest 10600.54830 0
1334 18 female 31.920 0 no northeast 2205.98080 0
1335 18 female 36.850 0 no southeast 1629.83350 0
1336 21 female 25.800 0 no southwest 2007.94500 0
1337 61 female 29.070 0 yes northwest 29141.36030 1

1338 rows × 8 columns

In [78]:
# Create inputs and targets
inputs, targets = medical_df[['age', 'bmi', 'children', 'smoker_code']], medical_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evalute the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 6056.439217188081
In [79]:
sns.barplot(data=medical_df, x='sex', y='charges')
Out[79]:
<Axes: xlabel='sex', ylabel='charges'>
In [80]:
sex_codes = {'female': 0, 'male': 1}
medical_df['sex_code'] = medical_df.sex.map(sex_codes)
medical_df.charges.corr(medical_df.sex_code)
Out[80]:
0.057292062202025366
In [81]:
# Create inputs and targets
inputs, targets = medical_df[['age', 'bmi', 'children', 'smoker_code', 'sex_code']], medical_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evalute the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 6056.100708754546
In [82]:
#One-hot Encoding
In [83]:
sns.barplot(data=medical_df, x='region', y='charges');
In [84]:
from sklearn import preprocessing
enc = preprocessing.OneHotEncoder()
enc.fit(medical_df[['region']])
enc.categories_
Out[84]:
[array(['northeast', 'northwest', 'southeast', 'southwest'], dtype=object)]
In [85]:
one_hot = enc.transform(medical_df[['region']]).toarray()
one_hot
Out[85]:
array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]])
In [86]:
medical_df[['northeast', 'northwest', 'southeast', 'southwest']] = one_hot
In [87]:
medical_df
Out[87]:
age sex bmi children smoker region charges smoker_code sex_code northeast northwest southeast southwest
0 19 female 27.900 0 yes southwest 16884.92400 1 0 0.0 0.0 0.0 1.0
1 18 male 33.770 1 no southeast 1725.55230 0 1 0.0 0.0 1.0 0.0
2 28 male 33.000 3 no southeast 4449.46200 0 1 0.0 0.0 1.0 0.0
3 33 male 22.705 0 no northwest 21984.47061 0 1 0.0 1.0 0.0 0.0
4 32 male 28.880 0 no northwest 3866.85520 0 1 0.0 1.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1333 50 male 30.970 3 no northwest 10600.54830 0 1 0.0 1.0 0.0 0.0
1334 18 female 31.920 0 no northeast 2205.98080 0 0 1.0 0.0 0.0 0.0
1335 18 female 36.850 0 no southeast 1629.83350 0 0 0.0 0.0 1.0 0.0
1336 21 female 25.800 0 no southwest 2007.94500 0 0 0.0 0.0 0.0 1.0
1337 61 female 29.070 0 yes northwest 29141.36030 1 0 0.0 1.0 0.0 0.0

1338 rows × 13 columns

In [88]:
# Create inputs and targets
input_cols = ['age', 'bmi', 'children', 'smoker_code', 'sex_code', 'northeast', 'northwest', 'southeast', 'southwest']
inputs, targets = medical_df[input_cols], medical_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evalute the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 6041.6796511744515
In [89]:
#Model Improvements

#Feature Scaling
In [90]:
model.coef_
Out[90]:
array([  256.85635254,   339.19345361,   475.50054515, 23848.53454191,
        -131.3143594 ,   587.00923503,   234.0453356 ,  -448.01281436,
        -373.04175627])
In [91]:
model.intercept_
Out[91]:
-12525.547811195444
In [92]:
weights_df = pd.DataFrame({
    'feature': np.append(input_cols, 1),
    'weight': np.append(model.coef_, model.intercept_)
})
weights_df
Out[92]:
feature weight
0 age 256.856353
1 bmi 339.193454
2 children 475.500545
3 smoker_code 23848.534542
4 sex_code -131.314359
5 northeast 587.009235
6 northwest 234.045336
7 southeast -448.012814
8 southwest -373.041756
9 1 -12525.547811
In [93]:
medical_df
Out[93]:
age sex bmi children smoker region charges smoker_code sex_code northeast northwest southeast southwest
0 19 female 27.900 0 yes southwest 16884.92400 1 0 0.0 0.0 0.0 1.0
1 18 male 33.770 1 no southeast 1725.55230 0 1 0.0 0.0 1.0 0.0
2 28 male 33.000 3 no southeast 4449.46200 0 1 0.0 0.0 1.0 0.0
3 33 male 22.705 0 no northwest 21984.47061 0 1 0.0 1.0 0.0 0.0
4 32 male 28.880 0 no northwest 3866.85520 0 1 0.0 1.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1333 50 male 30.970 3 no northwest 10600.54830 0 1 0.0 1.0 0.0 0.0
1334 18 female 31.920 0 no northeast 2205.98080 0 0 1.0 0.0 0.0 0.0
1335 18 female 36.850 0 no southeast 1629.83350 0 0 0.0 0.0 1.0 0.0
1336 21 female 25.800 0 no southwest 2007.94500 0 0 0.0 0.0 0.0 1.0
1337 61 female 29.070 0 yes northwest 29141.36030 1 0 0.0 1.0 0.0 0.0

1338 rows × 13 columns

In [94]:
from sklearn.preprocessing import StandardScaler
numeric_cols = ['age', 'bmi', 'children'] 
scaler = StandardScaler()
scaler.fit(medical_df[numeric_cols])
Out[94]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [95]:
scaler.mean_
Out[95]:
array([39.20702541, 30.66339686,  1.09491779])
In [96]:
scaler.var_
Out[96]:
array([197.25385199,  37.16008997,   1.45212664])
In [97]:
scaled_inputs = scaler.transform(medical_df[numeric_cols])
scaled_inputs
Out[97]:
array([[-1.43876426, -0.45332   , -0.90861367],
       [-1.50996545,  0.5096211 , -0.07876719],
       [-0.79795355,  0.38330685,  1.58092576],
       ...,
       [-1.50996545,  1.0148781 , -0.90861367],
       [-1.29636188, -0.79781341, -0.90861367],
       [ 1.55168573, -0.26138796, -0.90861367]])
In [98]:
cat_cols = ['smoker_code', 'sex_code', 'northeast', 'northwest', 'southeast', 'southwest']
categorical_data = medical_df[cat_cols].values
In [99]:
inputs = np.concatenate((scaled_inputs, categorical_data), axis=1)
targets = medical_df.charges

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evalute the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 6042.751556200273
In [100]:
weights_df = pd.DataFrame({
    'feature': np.append(numeric_cols + cat_cols, 1),
    'weight': np.append(model.coef_, model.intercept_)
})
In [101]:
weights_df
Out[101]:
feature weight
0 age 3.608872e+03
1 bmi 2.058490e+03
2 children 5.615551e+02
3 smoker_code 2.385276e+04
4 sex_code -1.670010e+02
5 northeast 1.758467e+17
6 northwest 1.758467e+17
7 southeast 1.758467e+17
8 southwest 1.758467e+17
9 1 -1.758467e+17
In [102]:
### Creating a Test Set
In [103]:
from sklearn.model_selection import train_test_split
In [104]:
inputs_train, inputs_test, targets_train, targets_test = train_test_split(inputs, targets, test_size=0.1)
In [105]:
# Create and train the model
model = LinearRegression().fit(inputs_train, targets_train)

# Generate predictions
predictions_test = model.predict(inputs_test)

# Compute loss to evalute the model
loss = rmse(targets_test, predictions_test)
print('Test Loss:', loss)
Test Loss: 6392.461462251641
In [106]:
# Generate predictions
predictions_train = model.predict(inputs_train)

# Compute loss to evalute the model
loss = rmse(targets_train, predictions_train)
print('Training Loss:', loss)
Training Loss: 6002.642998045587